{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import networkx as nx\n",
    "import pydot\n",
    "import graphviz\n",
    "from networkx.drawing.nx_pydot import graphviz_layout\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "\n",
    "import math\n",
    "from tqdm import tqdm\n",
    "import threading\n",
    "import pickle\n",
    "\n",
    "data_dir = '../data/'\n",
    "trace_data = os.path.join(data_dir, 'training_data', '2020_05_04', 'trace')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames = os.listdir(trace_data)\n",
    "trace_df = [pd.read_csv(trace_data + os.sep + f) for f in filenames]\n",
    "\n",
    "for i, df in enumerate(trace_df):\n",
    "    if df['callType'].iloc[0] == 'JDBC':\n",
    "      df['serviceName'] = df['dsName']\n",
    "      df = df.drop(['dsName'], axis=1)\n",
    "    elif df['callType'].iloc[0] == 'LOCAL':\n",
    "      df = df.drop(['dsName'], axis=1)\n",
    "    trace_df[i] = df\n",
    "\n",
    "trace_df = pd.concat(trace_df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped_traces = tuple(trace_df.groupby('traceId'))\n",
    "del trace_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_trace(trace):\n",
    "    ids = trace[trace['callType'] == 'CSF']\n",
    "    relationship = {}\n",
    "\n",
    "    def parse(row):\n",
    "        # parent -> child\n",
    "        if row['pid'] in ids:\n",
    "            relationship[row['pid']] = row['cmdb_id']\n",
    "            \n",
    "    def apply(row):\n",
    "        # parent -> new_parent\n",
    "        if row['callType'] != 'CSF':\n",
    "            return row\n",
    "        else:\n",
    "            if row['id'] in relationship:\n",
    "                row['cmdb_id'] = relationship[row['id']]\n",
    "            return row\n",
    "\n",
    "    trace.apply(parse, axis=1)\n",
    "    return trace.apply(apply, axis=1)\n",
    "\n",
    "\n",
    "def trace_graph(trace, prev_graph):\n",
    "    DG = nx.DiGraph(prev_graph)\n",
    "    \n",
    "    hosts = trace['cmdb_id'].unique()\n",
    "    services = trace['serviceName'].unique()\n",
    "\n",
    "    # Add nodes to the graph\n",
    "    for node in hosts:\n",
    "        DG.add_node(node, type='host')\n",
    "    \n",
    "    for node in services:\n",
    "        DG.add_node(node, type='service')\n",
    "\n",
    "    # Add edges to the graph\n",
    "    for _, row in trace.iterrows():\n",
    "        parent = trace[trace['id'] == row['pid']]['serviceName']\n",
    "        service = row['serviceName']\n",
    "        host = row['cmdb_id']\n",
    "    \n",
    "        # Parent service to current service\n",
    "        if(len(parent)): # Parent may be empty\n",
    "            DG.add_edge(parent.values[0], service)\n",
    "        \n",
    "        # Current service to its host\n",
    "        DG.add_edge(service, host)\n",
    "\n",
    "    return DG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# num_threads = os.cpu_count()\n",
    "# data = [{} for _ in range(num_threads)]\n",
    "\n",
    "# def function(idx, grouped_traces):\n",
    "#     print('Entered')\n",
    "#     for index, t in enumerate(grouped_traces):\n",
    "#         DG = trace_graph(trace=process_trace(t[1]), prev_graph=None)\n",
    "#         DG_hash = nx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash(DG)    \n",
    "\n",
    "#         trace_dgs = data[idx]\n",
    "        \n",
    "#         if DG_hash not in trace_dgs:\n",
    "#             trace_dgs.update({DG_hash : DG})\n",
    "\n",
    "#         if index > 10:\n",
    "#             break\n",
    "\n",
    "#         if index % 1000 == 0: \n",
    "#             print(idx, ':', index)\n",
    "#     print('Left')\n",
    "    \n",
    "\n",
    "# step_size = round(len(grouped_traces) / num_threads + 0.5)\n",
    "# threads = []\n",
    "# for i in range(num_threads):\n",
    "#     print(i)\n",
    "#     t = threading.Thread(target=function, args=(i, grouped_traces[i*step_size:(i+1)*step_size]))\n",
    "#     t.start()\n",
    "#     threads.append(t)\n",
    "\n",
    "# for t in threads:\n",
    "#     t.join()"
   ]
  },
  {
   "source": [
    "# num_threads = os.cpu_count()\n",
    "# data = [{} for _ in range(num_threads)]\n",
    "data = {} \n",
    "\n",
    "for i, t in tqdm(enumerate(grouped_traces), total=len(grouped_traces)):\n",
    "    DG = trace_graph(trace=process_trace(t[1]), prev_graph=None)\n",
    "    DG_hash = nx.algorithms.graph_hashing.weisfeiler_lehman_graph_hash(DG)\n",
    "\n",
    "    if DG_hash not in data:\n",
    "        data.update({DG_hash : DG})\n",
    "\n",
    "# Save \n",
    "data_filename = 'unique_graphs.pickle'\n",
    "with open(data_filename, 'wb') as f:\n",
    "    pickle.dump(data, f)\n",
    "\n",
    "print('\\n'+('-'*40))\n",
    "print('Saved pickle file')\n",
    "print('Number of graphs:',  len(data.keys()))\n",
    "print('Filename:', data_filename)\n",
    "print('-'*40)\n",
    "\n",
    "# print(data, len(data.keys()))\n"
   ],
   "cell_type": "code",
   "metadata": {},
   "execution_count": 25,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "  0%|          | 100/730041 [00:03<7:47:20, 26.03it/s]\n",
      "----------------------------------------\n",
      "Saved pickle file\n",
      "Number of graphs: 5\n",
      "Filename: Z:unique_graphs.pickle\n",
      "----------------------------------------\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "{'ddcc80605bb7964484afd458d64f553c': <networkx.classes.digraph.DiGraph object at 0x000001676D4CA320>, '7e626134855656d21edad7ebea80e3b2': <networkx.classes.digraph.DiGraph object at 0x000001676D637A90>, '4bc5a73e0f726b9a3d336b8ca1d58c06': <networkx.classes.digraph.DiGraph object at 0x000001676D3CA4E0>, '31ea5d3843487227dbfbdfe7727d716f': <networkx.classes.digraph.DiGraph object at 0x000001676D1F8828>, '1572870b16a9c8bdae01f7446fc40cdf': <networkx.classes.digraph.DiGraph object at 0x000001676D1F8320>}\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# with open(data_filename, 'rb') as f:\n",
    "#     final_data_recovered = pickle.load(f)\n",
    "\n",
    "# print(final_data_recovered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# final_data = {}\n",
    "# for d in data:\n",
    "#     for k in d.keys():\n",
    "#         if k not in final_data.keys():\n",
    "#             final_data[k] = d[k]\n",
    "\n",
    "# print(final_data, len(final_data.keys()))\n",
    "\n",
    "\n",
    "# with open(data_filename, 'wb') as f:\n",
    "#     pickle.dump(final_data, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6 anm-project",
   "language": "python",
   "name": "anm-project"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}